Set up options

Install new packages

# Install packages. Do this only once.
options(repos="https://cran.rstudio.com" )
install.packages("epiDisplay")
install.packages("Hmisc")
install.packages("ggplot2")
install.packages("GGally")
install.packages("vioplot")
install.packages("forestplot")

# To avoid installing every time: change set up in curly brackets to eval=FALSE

Load packages

Specify relative file directories

directory <- "/cloud/project" #Class option when coding on RStudio Cloud, update for your personal computer

Load data, remake useful variables

# Check the file path
file.path(directory, "nhanes3.rda")
## [1] "/cloud/project/nhanes3.rda"
# Load the saved R data
load(file.path(directory, "nhanes3.rda"))


# Remake a few variables from last class 
sex1 <- factor(nhanes$sex, levels = c(1, 2), labels = c("male", "female"))
AGE5b <- cut(nhanes$age, quantile(nhanes$age, c(0, .2, .4, .6, .8, 1)), include.lowest = T) # quintiles
AGE5c <- cut(nhanes$age, breaks = c(19, 40, 50, 60, 70, 90))
age5c <- unclass(AGE5c)

nhanes <- cbind(nhanes, sex1, AGE5b, AGE5c, age5c)
### Basic plotting functions

plot(nhanes$bmi)

plot(sort(nhanes$bmi))

summary(nhanes$bmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   14.40   23.20   26.40   27.24   30.12   68.50      10
plot(sort(nhanes$bmi), ylim = c(12, 70), xlab = "Rank", ylab = expression(paste("BMI in kg/m"^"2")), main = "Distribution of BMI in NHANES")

plot(nhanes$bmi, nhanes$chol, pch = 20, cex = 0.8, ylab = "Cholesterol in mg/dL", xlab = expression(paste("BMI in kg/m"^"2")), main = "Association between Cholesterol and BMI", las = 1, col = "dodgerblue")
rug(nhanes$bmi, side = 1)

# If you want your plot to appear in a separate window
# quartz() #mac only
# x11() #pc or mac (kind of slow)

# If you want to save your plot directly to file, use pdf(), png(), jpg(), etc. Close the plotting with dev.off()
pdf(file = file.path(directory, "BMI_plot.pdf"))
plot(nhanes$bmi, nhanes$chol, pch = 20, cex = 0.8, ylab = "Cholesterol in mg/dL", xlab = expression(paste("BMI in kg/m"^"2")), main = "Association between Cholesterol and BMI", las = 1, col = "dodgerblue")
rug(nhanes$bmi, side = 1)
dev.off()
## png 
##   2
# barplot, base graphics
table(nhanes$sex1)
## 
##   male female 
##   2335   2739
barplot(table(nhanes$sex1))

barplot(table(nhanes$sex1), col = c("dodgerblue", "firebrick1"), las = 1, ylab = "Frequency", cex.lab = 1.2)

# horizontal
barplot(table(nhanes$sex1), col = c(2, 4), horiz = T, las = 1, cex.names = 1.2, xlab = "Frequency", cex.lab = 1.2)

# barplots for subgroups
barplot(table(nhanes$smk, nhanes$sex1), las = 1)

# add legends
barplot(table(nhanes$smk, nhanes$sex1), col = c(2:4), legend = c("Never", "Former", "Current"), las = 1)

barplot(table(nhanes$smk, nhanes$sex1), col = c(2:4), legend.text = TRUE, args.legend = list(x = "topleft", legend = c("Never", "Former", "Current"), bty = "n", ncol = 3, cex = 0.7), las = 1)

barplot(table(nhanes$smk, nhanes$sex1), col = c(2:4), las = 1)
legend(x = "topleft", legend = c("Never", "Former", "Current"), bty = "n", ncol = 3, cex = 0.7, fill = c(2:4))

barplot(table(nhanes$smk, nhanes$sex1), col = c(2:4), legend = levels(nhanes$smk), beside = T)
legend(x = "topleft", legend = c("Never", "Former", "Current"), bty = "n", fill = c(2:4))

# y-axis as proportion
barplot(prop.table(table(nhanes$smk, nhanes$sex1)), las = 1, ylab = "Proportion of observations", cex.lab = 1.3)

barplot(prop.table(table(nhanes$smk, nhanes$sex1)), col = c(2:4), las = 1, ylab = "Proportion of observations", cex.lab = 1.3)

# barplot, ggplot2 package
ggplot(nhanes, aes(sex1)) + geom_bar()

ggplot(nhanes, aes(sex1, fill = sex1, color = sex1)) + geom_bar()

# horizontal
ggplot(nhanes, aes(sex1, fill = sex1, color = sex1)) + geom_bar() + coord_flip()

# Histogram, base graphics
hist(nhanes$age)

hist(nhanes$age, breaks = 4, col = "gray")

hist(nhanes$age, breaks = c(20, 30, 40, 50, 60, 70, 80, 90), col = "lightblue")

hist(nhanes$sbp, col = "dodgerblue", xlab = "Systolic Blood Pressure", main = "Blood Pressure")

hist(nhanes$sbp, col = "dodgerblue", xlab = "Systolic Blood Pressure", main = "Blood Pressure", breaks = seq(80, 240, by = 2))

# histogram with density
hist(nhanes$sbp, breaks = 20, col = "lightblue", border = "blue", freq = F)
lines(density(nhanes$sbp), lwd = 3)

# Overlapping histograms
tapply(nhanes$sbp, nhanes$sex1, summary)
## $male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    80.0   116.0   125.0   128.6   138.0   207.0 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    83.0   109.0   119.0   124.3   137.0   237.0
hist(nhanes$sbp[nhanes$sex == 1], col = rgb(1, 0, 0, 0.5), xlim = c(80, 240), ylim = c(0, 150), main = "Overlapping Histogram", xlab = "SBP", breaks = 50, las = 1)
hist(nhanes$sbp[nhanes$sex == 2], col = rgb(0, 0, 1, 0.5), breaks = 100, add = T)
legend("topright", legend = c("Male", "Female"), fill = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), bty = "n")

# overlapping densities
plot(density(nhanes$sbp[nhanes$sex == 1]),
  lwd = 4, col = "firebrick1", xlab = "Systolic Blood Pressure", las = 1,
  ylab = "Density", main = "Overlapping Densities", ylim = c(0, 0.03)
)
lines(density(nhanes$sbp[nhanes$sex == 2]), lwd = 4, col = "dodgerblue", lty = 2)
legend("topright", fill = c("firebrick1", "dodgerblue"), legend = c("Male", "Female"))

# Histogram, ggplot2 package
ggplot(nhanes, aes(age)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(nhanes, aes(age)) + geom_histogram(binwidth = 10)

# add density plot
ggplot(nhanes, aes(sbp, ..density..)) + geom_histogram(fill = "lightgreen", color = "green", binwidth = 15) + geom_density()

# overlapping histograms
ggplot(nhanes, aes(sbp, fill = sex1)) + geom_histogram() + facet_grid(sex1 ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(nhanes, aes(sbp, fill = sex1)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(nhanes, aes(sbp, fill = sex1)) + geom_histogram() + scale_fill_grey(start = 0, end = 0.75)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Boxplot, base graphics
median(nhanes$bpb)
## [1] 3.2
quantile(nhanes$bpb)
##   0%  25%  50%  75% 100% 
##  0.7  1.9  3.2  5.0 48.1
quantile(nhanes$bpb, c(0.1, 0.9))
## 10% 90% 
## 1.1 7.5
boxplot(nhanes$bpb)
rug(nhanes$bpb, side = 2)

boxplot(log(nhanes$bpb), col = "lightgray")

# split boxes by another variable
table(nhanes$smk)
## 
##    1    2    3 
## 2539 1263 1272
boxplot(nhanes$bmi ~ nhanes$smk, col = c("seagreen", "dodgerblue", "darkorchid"), names = c("Never", "Former", "Current"), las = 1, ylab = "BMI", cex.lab = 1.3, xlab = "Smoking Status")

# options for boxes
boxplot(log(nhanes$bpb) ~ nhanes$AGE5b, col = 4, las = 1, ylab = "Log(blood lead level, ug/dL)", xlab = "Age category", cex.lab = 1.2)

boxplot(log(nhanes$bpb) ~ nhanes$AGE5b, notch = T, col = "yellow", las = 1, ylab = "Log(blood lead level, ug/dL)", xlab = "Age category", cex.lab = 1.2)

boxplot(log(nhanes$bpb) ~ nhanes$AGE5c, varwidth = T, col = "orange", las = 1, ylab = "Log(blood lead level, ug/dL)", xlab = "Age category", cex.lab = 1.2)

## draw widths proportional to sqrt(No of obs) in the groups
boxplot(log(nhanes$bpb) ~ nhanes$AGE5b, horizontal = T, col = "pink", las = 1, xlab = "Log (blood lead level, ug/dL)")

## group boxes by two variables
boxplot(log(nhanes$bpb) ~ nhanes$sex1 * nhanes$AGE5b, las = 2, ylab = "Log(blood lead level, ug/dL)", col = c("dodgerblue", "darkorchid"), cex.lab = 1.3, cex.axis = 0.7)

boxplot(log(nhanes$bpb) ~ nhanes$sex1 * nhanes$AGE5b, las = 2, ylab = "Log(blood lead level, ug/dL)", xlab="", col = c("dodgerblue", "darkorchid"), cex.lab = 1.3, xaxt = "n", ylim = c(-0.5, 4.5))
axis(1, at = seq(1.5, 10, 2), labels = levels(AGE5b))
title(xlab = "Age Category", cex.lab = 1.3)
legend("topright", c("Male", "Female"), fill = c("dodgerblue", "darkorchid"), cex = 0.7, ncol = 2)

# violin plot
vioplot(log(nhanes$bpb))
vioplot(log(nhanes$bpb[nhanes$smk == 1]), log(nhanes$bpb[nhanes$smk == 2]), log(nhanes$bpb[nhanes$smk == 3]), col = "dodgerblue", names = c("Never", "Former", "Current"))
title("Log(Blood Lead Level) by Smoking Status")
# Boxplot, ggplot2 package
ggplot(nhanes, aes(log(bpb))) + geom_boxplot(fill = "#990000",  color = "#3366FF", notch = T, notchwidth = .3)

# boxplot split by another variable
ggplot(nhanes, aes(AGE5b, log(bpb))) + geom_boxplot()

# boxplot split by two variables
ggplot(nhanes, aes(AGE5b, log(bpb))) + geom_boxplot(aes(fill = sex1)) + labs(title = "Boxplot of Blood Lead Levels") + scale_x_discrete("Age") + scale_y_continuous("Log(Blood lead levels)")

Exercise 5A

## Assuming that you obtained ORs for all and by gender and race.
## Let's create a data frame for these ORs and 95% CIs.

x.num <- c(1, 3, 4, 6, 7)
x1 <- c("All", "Male", "Female", "White", "Black")
or <- c(1.5, 1.1, 2.0, 1.4, 1.6)
or.ll <- c(1.2, 0.9, 1.65, 0.95, 1.3)
or.ul <- c(1.85, 1.35, 2.4, 2.05, 2.0)
results <- data.frame(x.num, x1, or, or.ll, or.ul)
# Forest plot, base graphics
plot(or, x.num, xlim = c(0, 2.5), ylim = c(0, 7), pch = 20, bty = "n", ylab = "", xaxt = "n", cex.lab = 1.5, cex = 3, col = "dodgerblue", yaxt = "n", xlab = "Odds Ratio")
for (i in 1:5) {
  lines(x = c(or.ll[i], or.ul[i]), y = c(x.num[i], x.num[i]), lwd = 2)
}
axis(1, cex.axis = 1.2)
axis(2, at = x.num, labels = x1, las = 1, lwd = 0, cex.axis = 1.2, pos = .3) # pos is x-axis location of labels
segments(1, 0, 1, 7.5, lty = 2)

# With forestplot package
results <- structure(list(
  mean = c(NA, or),
  lower = c(NA, or.ll),
  upper = c(NA, or.ul)
),
# .Names = c("mean", "lower", "upper")),
row.names = x1,
class = "data.frame"
)
tabletext <- cbind(c("Participant", x1), c("OR", or))
forestplot(tabletext,
  hrzl_lines = gpar(col = "#444444"),
  results, new_page = TRUE,
  is.summary = c(TRUE, rep(FALSE, 5)),
  clip = c(0.1, 3),
  xlog = FALSE,
  col = fpColors(box = "royalblue", line = "darkblue", summary = "royalblue")
)
## Warning in omit | x: longer object length is not a multiple of shorter object
## length

# add summary OR
forestplot(tabletext,
  results,
  new_page = TRUE,
  is.summary = c(TRUE, TRUE, rep(FALSE, 4)),
  clip = c(0.1, 3),
  xlog = FALSE,
  col = fpColors(box = "royalblue", line = "darkblue", summary = "royalblue")
)
## Warning in omit | x: longer object length is not a multiple of shorter object
## length

# Forest plot, ggplot2 package
ggplot(results, aes(y=x1, x=or, xmin=or.ll, xmax=or.ul)) + geom_pointrange() + xlab("Odds Ratio") + ylab("Group") + geom_vline(xintercept = 1, linetype = 2)  

# Scatterplot, base graphics
plot(nhanes$age, log(nhanes$bpb))

plot(nhanes$age, log(nhanes$bpb), xlab = "Age", ylab = "Log(Blood lead, ug/dL)", las = 1, cex.lab = 1.2, cex=0.2)
title("Scatterplot of Age vs. Blood Lead")
abline(lsfit(nhanes$age, log(nhanes$bpb)), col = "firebrick1", lwd = 3)

# identify(age,log(bpb))
# Tool to location spots on the plot
# locator()
## add a smoothing line
plot(nhanes$age, log(nhanes$bpb), xlab = "Age", ylab = "Log(Blood lead, ug/dL)", las = 1, cex.lab = 1.2, pch = 20, cex = 0.6)
title("Scatterplot of Age vs. Blood Lead")
lines(smooth.spline(nhanes$age, log(nhanes$bpb), df = 10), col = "dodgerblue", lwd = 3)

# plot(nhanes$age, nhanes$bmi, xlab = "Age", ylab = "Log(BMI)", las = 1, cex.lab = 1.2, pch = 20, cex = 0.6)
# title("Scatterplot of Age vs. BMI")
# lines(smooth.spline(nhanes$age, nhanes$bmi, df = 10), col = "dodgerblue", lwd = 3)
# smooth.spline() errors out with missing data

nomiss <- na.omit(data.frame(nhanes$age, nhanes$bmi)) # make an object without missing
dim(nomiss)
## [1] 5064    2
head(nomiss)
##   nhanes.age nhanes.bmi
## 1         21       25.5
## 2         35       29.4
## 3         44       44.4
## 4         48       37.5
## 5         66       23.6
## 6         63       31.1
plot(nomiss$nhanes.age, nomiss$nhanes.bmi, xlab = "Age", ylab = "Log(BMI)", las = 1, cex.lab = 1.2, pch = 20, cex = 0.6)
title("Scatterplot of Age vs. BMI")
lines(smooth.spline(nomiss$nhanes.age, nomiss$nhanes.bmi, df = 10), col = "dodgerblue", lwd = 3)

# or restrict to nonmissing within the smooth.spline function
plot(nhanes$age, nhanes$bmi, xlab = "Age", ylab = "Log(BMI)", las = 1, cex.lab = 1.2, pch = 20, cex = 0.6)
title("Scatterplot of Age vs. BMI")
lines(smooth.spline(nhanes$age[!is.na(nhanes$bmi)], na.omit(nhanes$bmi), df = 10), col = "dodgerblue", lwd = 3)

## Scatterplots by group
plot(nhanes$age, log(nhanes$bpb), xlab = "Age", ylab = "Log(Blood lead)", pch = nhanes$sex, col = c("seagreen", "darkorchid")[nhanes$sex], cex = 0.5, las = 1, cex.lab = 1.2)
abline(lsfit(nhanes$age[nhanes$sex == 1], log(nhanes$bpb)[nhanes$sex == 1]), lty = 1, lwd = 3, col = "seagreen")
abline(lsfit(nhanes$age[nhanes$sex == 2], log(nhanes$bpb)[nhanes$sex == 2]), lty = 1, lwd = 3, col = "darkorchid")
legend("topleft", c("Male", "Female"), lty = c(1, 1), pch = c(1, 2), col = c("seagreen", "darkorchid"))

# dealing with dense data overplotting
smoothScatter(nhanes$age, log(nhanes$bpb), ylab = "Log(Blood lead)", xlab = "Age", las = 1, cex.lab = 1.2)

# Scatterplot, ggplot2 package
ggplot(nhanes, aes(age, log(bpb))) + geom_point(shape = 1) # open circles

## add a linear fit
ggplot(nhanes, aes(age, log(bpb))) + geom_point() + geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

## add a smoothing line
ggplot(nhanes, aes(age, log(bpb))) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# split by another variable
ggplot(nhanes, aes(x=age, y=log(bpb), shape=sex1)) + geom_point(aes(color=sex1, shape=sex1), size = 0.8, alpha = 0.5) + geom_smooth(method=lm, aes(color=sex1))
## `geom_smooth()` using formula 'y ~ x'

# multiple graphs on one page
par(mfrow = c(1, 2)) # Here, make 1 row of graphs with 2 columns
plot(nhanes$age, log(nhanes$bpb), pch = 20, cex = 0.2, ylab = "Log(Blood Lead Level)", las = 1, xlab = "Age")
abline(lsfit(nhanes$age, log(nhanes$bpb)), col = "firebrick1", lwd = 2)
boxplot(split(log(nhanes$bpb), nhanes$AGE5b), notch = T, las = 2, ylab = "Log(Blood Lead Level)", pch = 20, cex = 0.5)

par(mfrow = c(1, 1)) # Go back to 1 row of graphs with 1 column of graphs
## In traditional graphics, just do the graphis in order and they find their place
## In grid, we must instead save the plot to an object
plot1 <- ggplot(nhanes, aes(sex1, fill = sex1, color = sex1)) + geom_bar()
plot2 <- ggplot(nhanes, aes(sbp, ..density..)) + geom_histogram(fill = "lightgreen", color = "green", binwidth = 15) + geom_density()
plot3 <- ggplot(nhanes, aes(AGE5b, log(bpb))) + geom_boxplot()
plot4 <- ggplot(nhanes, aes(age, log(bpb))) + geom_point(size=0.2, alpha=0.5) + geom_smooth()

## first clear the page with the grid.newpage() function.
## This is an important step. Otherwise plots printed using
## the following methods will appear on top of any previous plots.

grid.newpage()

## Next, use the pushViewport() function to define the various frames (viewports)
## in the grid graphic system

pushViewport(viewport(layout = grid.layout(2, 2)))

## and then use the print function to print the objects into the viewport.

print(plot1, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(plot2, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
print(plot3, vp = viewport(layout.pos.row = 2, layout.pos.col = 1))
print(plot4, vp = viewport(layout.pos.row = 2, layout.pos.col = 2))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Scatterplot matrix, base graphics

# pairs function, basic
pairs(cbind(nhanes$sbp, nhanes$bpb, nhanes$bmi, nhanes$age), pch = 20, cex = 0.4, col = "dodgerblue")

# Set up the function for the correlation coefficient text for the upper cells
panel.cor <- function(x, y, digits = 2, prefix = "r=", cex.cor, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y, use = "pairwise.complete.obs"))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
  text(0.5, 0.5, txt, cex = 2)
}

# Set up the function for the histograms for the middle cells
panel.hist <- function(x, ...) {
  usr <- par("usr")
  on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts
  y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, ...)
}

# Put it all together
pairs(cbind(nhanes$sbp, nhanes$bpb, nhanes$bmi, nhanes$age), main = "Pairwise Relationships", pch = 20, cex = 0.4, font.labels = 2, cex.labels = 2, col = "dodgerblue", upper.panel = panel.cor, diag.panel = panel.hist, lower.panel = panel.smooth)

# Scatterplot matrix, ggplot2 package
ggpairs(nhanes[, c("sbp", "dbp", "bmi")])
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 10 rows containing missing values
## Warning: Removed 10 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing non-finite values (stat_density).

Exercise 5B